In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
import missingno as msno
import plotly.express as px
In [321]:
# Suppress warnings
warnings.filterwarnings("ignore")
# Load the dataset
def load_dataset(filepath):
try:
df = pd.read_csv(filepath)
print("Dataset loaded successfully.")
return df
except FileNotFoundError:
print("Error: File not found. Please check the file path.")
return None
except Exception as e:
print(f"An error occurred: {e}")
return None
In [322]:
def summarize_dataset(df):
print("First 5 rows of the dataset:")
print(df.head())
print(f"\nShape of the dataset: {df.shape}")
print("\nDataset Info:")
print(df.info())
print("\nMissing values in each column:")
print(df.isnull().sum())
In [323]:
# Visualize missing values
def visualize_missing_values(df):
msno.matrix(df)
plt.show()
msno.bar(df)
plt.show()
In [324]:
# Handle missing data
def handle_missing_data(df):
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())
print("\nMissing values handled by filling with the median.")
return df
In [325]:
# Remove duplicate rows
def remove_duplicates(df):
duplicates = df.duplicated().sum()
print(f"\nThere are {duplicates} duplicate rows in the dataset.")
df = df.drop_duplicates()
print("Duplicates removed.")
return df
In [326]:
# Remove outliers using Z-score
def remove_outliers(df, column, z_thresh=3):
"""
Remove outliers from a specified column using Z-score.
"""
if column not in df.columns:
print(f"Column '{column}' not found in dataset. Skipping outlier removal.")
return df
z_scores = np.abs(stats.zscore(df[column].dropna()))
df = df[z_scores < z_thresh]
print(f"Outliers removed from column '{column}' using Z-score threshold of {z_thresh}.")
return df
In [327]:
# Display descriptive statistics
def display_descriptive_stats(df):
print("\nDescriptive Statistics:")
print(df.describe())
In [328]:
# Plot the distribution of a numeric column
def plot_distribution(df, column):
if column in df.columns:
plt.figure(figsize=(10, 6))
sns.histplot(df[column], kde=True, bins=30)
plt.title(f'Distribution of {column}')
plt.xlabel(column)
plt.ylabel('Frequency')
plt.show()
else:
print(f"Column '{column}' not found in dataset. Skipping distribution plot.")
In [329]:
# Pairplot for selected columns
def plot_pairplot(df, columns):
selected_columns = [col for col in columns if col in df.columns]
if selected_columns:
sns.pairplot(df[selected_columns])
plt.suptitle("Pairplot of Selected Columns", y=1.02)
plt.show()
else:
print("No valid columns found for pairplot.")
In [330]:
# Trend plot for yearly data
def plot_trend(df, years):
available_years = [year for year in years if year in df.columns]
if available_years:
df[available_years].mean().plot(figsize=(12, 6), marker='o')
plt.title("Yearly Trend of Average Values")
plt.xlabel("Year")
plt.ylabel("Average Value")
plt.show()
else:
print("No valid years found for trend plot.")
In [331]:
# Bar Chart for Categorical Data with Top N Categories
def plot_bar_chart(df, column, top_n=10):
if column in df.columns:
top_categories = df[column].value_counts().head(top_n)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_categories.index, y=top_categories.values)
plt.title(f'Top {top_n} {column} Distribution (Bar Chart)')
plt.xlabel(column)
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
else:
print(f"Column '{column}' not found in dataset. Skipping bar chart.")
In [332]:
# Pie Chart for Gender Unemployment Rate
def plot_gender_unemployment_pie(df, gender_column, rate_column):
if gender_column in df.columns and rate_column in df.columns:
avg_unemployment_by_gender = df.groupby(gender_column)[rate_column].mean()
plt.figure(figsize=(8, 8))
avg_unemployment_by_gender.plot(kind='pie', autopct='%1.1f%%', startangle=90, cmap='Pastel1')
plt.title('Average Unemployment Rate by Gender')
plt.ylabel('')
plt.show()
else:
print(f"Columns '{gender_column}' or '{rate_column}' not found in dataset. Skipping gender unemployment pie chart.")
In [333]:
# Box Plot with Grouping by Category
def plot_box_plot(df, column, group_by=None):
if column in df.columns:
if group_by and group_by in df.columns:
plt.figure(figsize=(10, 6))
sns.boxplot(x=group_by, y=column, data=df)
plt.title(f'{column} by {group_by} (Box Plot)')
plt.xlabel(group_by)
plt.ylabel(column)
plt.show()
else:
plt.figure(figsize=(10, 6))
sns.boxplot(x=df[column])
plt.title(f'{column} Box Plot')
plt.xlabel(column)
plt.show()
else:
print(f"Column '{column}' not found in dataset. Skipping box plot.")
In [334]:
# KDE Plot for a numeric column
def plot_kde(df, column):
if column in df.columns:
plt.figure(figsize=(10, 6))
sns.kdeplot(df[column], shade=True)
plt.title(f'{column} KDE Plot')
plt.xlabel(column)
plt.ylabel('Density')
plt.show()
else:
print(f"Column '{column}' not found in dataset. Skipping KDE plot.")
In [335]:
# Stacked Bar Chart for Yearly Data
def plot_stacked_bar_chart(df, years, group_by=None):
available_years = [year for year in years if year in df.columns]
if available_years:
if group_by and group_by in df.columns:
df_grouped = df.groupby([group_by])[available_years].sum()
df_grouped.plot(kind='bar', stacked=True, figsize=(12, 8))
plt.title(f'Stacked Bar Chart by {group_by} (Yearly Data)')
else:
df[available_years].plot(kind='bar', stacked=True, figsize=(12, 8))
plt.title('Stacked Bar Chart (Yearly Data)')
plt.xlabel('Index')
plt.ylabel('Values')
plt.xticks(rotation=45)
plt.show()
else:
print("No valid years found for stacked bar chart.")
In [336]:
# Interactive Bar Chart for Categorical Data with Plotly
def plot_interactive_bar_chart(df, column, top_n=10):
if column in df.columns:
top_categories = df[column].value_counts().head(top_n)
fig = px.bar(top_categories, x=top_categories.index, y=top_categories.values,
labels={column: 'Category', 'y': 'Count'},
title=f'Top {top_n} {column} Distribution (Interactive Bar Chart)')
fig.show()
else:
print(f"Column '{column}' not found in dataset. Skipping interactive bar chart.")
In [337]:
def plot_unemployment_by_age_group(df, age_group_column, year_columns):
"""
Plots the average unemployment rate for each age group across selected years.
"""
if age_group_column not in df.columns or not all(year in df.columns for year in year_columns):
print("Error: Missing required columns in the dataset.")
return
# Calculate the average unemployment rate for each age group across selected years
age_group_avg = (
df.groupby(age_group_column)[year_columns]
.mean(numeric_only=True)
.mean(axis=1)
.sort_values(ascending=False)
)
# Plot the average unemployment rate by age group as a bar chart
plt.figure(figsize=(10, 6))
sns.barplot(x=age_group_avg.index, y=age_group_avg.values, palette='viridis')
plt.title('Average Unemployment Rate by Age Group (Selected Years)', fontsize=16)
plt.xlabel('Age Group', fontsize=14)
plt.ylabel('Average Unemployment Rate', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
In [338]:
# Correlation Heatmap
def plot_correlation_heatmap(df):
numeric_columns = df.select_dtypes(include='number').columns
correlation_matrix = df[numeric_columns].corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
In [339]:
# Enhanced Visualizations
def plot_all_feature_distributions(df):
numeric_columns = df.select_dtypes(include=[np.number]).columns
for column in numeric_columns:
plot_distribution(df, column)
plot_kde(df, column)
def plot_all_boxplots(df):
numeric_columns = df.select_dtypes(include=[np.number]).columns
for column in numeric_columns:
plot_box_plot(df, column)
In [21]:
In [340]:
def plot_scatter_geo(df, rate_col):
if 'country_name' in df.columns and rate_col in df.columns:
fig = px.scatter_geo(df, locations='country_name', locationmode='country names',
size=rate_col, title='Global Unemployment Rates',
color=rate_col, color_continuous_scale='Viridis')
fig.update_layout(margin={'r': 0, 't': 50, 'l': 0, 'b': 0})
fig.show()
else:
print("Error: Required columns for scatter plot are missing.")
In [341]:
def plot_top_20_countries_by_avg_unemployment(df, rate_column='2024', country_column='country_name'):
if country_column in df.columns and rate_column in df.columns:
# Calculate average unemployment rate per country
avg_unemployment = df.groupby(country_column)[rate_column].mean().sort_values(ascending=False).head(20)
# Plot the bar graph
plt.figure(figsize=(12, 8))
sns.barplot(x=avg_unemployment.values, y=avg_unemployment.index, palette='coolwarm')
plt.title('Top 20 Countries by Average Unemployment Rate', fontsize=16)
plt.xlabel('Average Unemployment Rate', fontsize=14)
plt.ylabel('Country', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
else:
print(f"Columns '{country_column}' or '{rate_column}' not found in the dataset. Skipping the plot.")
In [342]:
def plot_top_10_countries_pie_chart(df, years, country_column='country_name'):
# Check if the country column and selected years exist in the dataset
if country_column in df.columns and all(year in df.columns for year in years):
# Calculate the average unemployment rate across selected years for each country
df['avg_unemployment'] = df[years].mean(axis=1)
# Group by country and calculate the overall average
avg_unemployment = df.groupby(country_column)['avg_unemployment'].mean().sort_values(ascending=False).head(10)
# Plot the pie chart
plt.figure(figsize=(8, 8))
avg_unemployment.plot(kind='pie', autopct='%1.1f%%', startangle=140, cmap='tab10')
plt.title('Top 10 Countries by Average Unemployment Rate Across Selected Years', fontsize=16)
plt.ylabel('')
plt.tight_layout()
plt.show()
else:
missing_years = [year for year in years if year not in df.columns]
print(f"Missing columns: {missing_years}. Please check the dataset.")
In [343]:
def plot_top_10_countries_stacked_bar_chart(df, years, country_column='country_name'):
# Check if the country column and selected years exist in the dataset
if country_column in df.columns and all(year in df.columns for year in years):
# Calculate the average unemployment rate for each country across the selected years
df['avg_unemployment'] = df[years].mean(axis=1)
# Group by the country column and calculate the average unemployment rate
avg_by_country = df.groupby(country_column)['avg_unemployment'].mean()
# Get the top 10 countries by average unemployment rate
top_countries = avg_by_country.sort_values(ascending=False).head(10).index
# Filter the dataset for these top 10 countries
top_countries_df = df[df[country_column].isin(top_countries)]
# Pivot the data to prepare for stacked bar plotting
stacked_data = top_countries_df.groupby(country_column)[years].mean()
# Plot the stacked bar chart
stacked_data.plot(kind='bar', stacked=True, figsize=(12, 8), colormap='tab20')
plt.title('Stacked Bar Chart of Average Unemployment Rate by Year (Top 10 Countries)', fontsize=16)
plt.xlabel('Country', fontsize=12)
plt.ylabel('Unemployment Rate', fontsize=12)
plt.xticks(rotation=45)
plt.legend(title="Years", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
else:
missing_years = [year for year in years if year not in df.columns]
print(f"Missing columns: {missing_years}. Please check the dataset.")
In [344]:
def plot_unemployment_trends(df, country_column, years):
"""
Plots unemployment rate trends over time for each country.
"""
plt.figure(figsize=(12, 8))
for country in df[country_column].unique():
country_data = df[df[country_column] == country]
plt.plot(years, country_data[years].mean(), label=country)
plt.title('Unemployment Rate Trends by Country', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Unemployment Rate', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.show()
In [345]:
def plot_unemployment_by_gender(df, gender_column, year_columns):
"""
Plots average unemployment rates by gender.
"""
gender_avg = df.groupby(gender_column)[year_columns].mean().mean(axis=1)
plt.figure(figsize=(8, 6))
sns.barplot(x=gender_avg.index, y=gender_avg.values, palette='pastel')
plt.title('Average Unemployment Rate by Gender', fontsize=16)
plt.xlabel('Gender', fontsize=14)
plt.ylabel('Average Unemployment Rate', fontsize=14)
plt.show()
In [346]:
def plot_correlation_with_age_group(df, age_group_column, year_columns):
"""
Plots a heatmap showing the correlation between age groups and unemployment rates.
"""
correlation_data = df.pivot_table(index=age_group_column, values=year_columns, aggfunc='mean').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_data, annot=True, cmap='coolwarm')
plt.title('Correlation Between Age Groups and Unemployment', fontsize=16)
plt.show()
In [347]:
import plotly.express as px
def plot_high_unemployment_map(df, country_column, year_column):
"""
Plots an interactive map showing unemployment rates across countries.
"""
fig = px.choropleth(df, locations=country_column, locationmode='country names',
color=year_column, title=f'Unemployment Rates in {year_column}',
color_continuous_scale='reds')
fig.show()
In [348]:
def plot_age_gender_unemployment(df, age_column, gender_column, rate_column):
"""
Plots a box plot to compare unemployment rates by age and gender.
"""
plt.figure(figsize=(12, 8))
sns.boxplot(x=age_column, y=rate_column, hue=gender_column, data=df)
plt.title('Unemployment Rate by Age Group and Gender', fontsize=16)
plt.xlabel('Age Group', fontsize=14)
plt.ylabel('Unemployment Rate', fontsize=14)
plt.legend(title='Gender', fontsize=12)
plt.xticks(rotation=45)
plt.show()
In [349]:
def compare_unemployment_by_gender_and_age(df, year='2023'):
"""
Compare unemployment rates across gender and age groups for a given year.
- A bar plot comparing unemployment rates across gender and age groups.
"""
plt.figure(figsize=(12, 6))
sns.barplot(x='sex', y=year, hue='age_group', data=df)
plt.title(f'Unemployment Rate by Gender and Age Group in {year}')
plt.xlabel('Gender')
plt.ylabel('Unemployment Rate %')
plt.show()
In [350]:
def scatterplot_unemployment_by_age(df, year='2024'):
"""
Create a scatterplot to visualize unemployment rates across age categories for a given year.
"""
plt.figure(figsize=(12, 6))
sns.scatterplot(x='age_categories', y=year, data=df)
plt.title(f'Unemployment Rate by Age Categories in {year}')
plt.xlabel('Age Categories')
plt.ylabel('Unemployment Rate %')
plt.show()
In [351]:
def countplot_and_scatterplot_unemployment(df):
"""
Create a countplot to visualize the distribution of unemployment across different age categories,
and a scatterplot to show the relationship between age categories and unemployment rate for 2024.
"""
# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
# Countplot on the first axis
sns.countplot(x='age_categories', data=df, palette='Set2', ax=axes[0])
axes[0].set_title('Distribution of Unemployment Across Age Categories')
axes[0].set_xlabel('Age Categories')
axes[0].set_ylabel('Count')
# Scatterplot on the second axis
sns.scatterplot(x='age_categories', y='2024', data=df, ax=axes[1], palette='Set2')
axes[1].set_title('Relationship Between Age Categories and Unemployment Rate in 2024')
axes[1].set_xlabel('Age Categories')
axes[1].set_ylabel('Unemployment Rate %')
plt.tight_layout() # Adjust layout to prevent overlap
plt.show()
In [352]:
def plot_violinplot(df, column, by):
sns.violinplot(x=by, y=column, data=df)
plt.xlabel(by.capitalize())
plt.ylabel(column.capitalize())
plt.title(f'Distribution of {column.capitalize()} by {by.capitalize()}')
plt.show()
In [353]:
# Main function
def main():
filepath = 'global_unemployment_data.csv' # Update with your file path
df = load_dataset(filepath)
if df is not None:
summarize_dataset(df)
visualize_missing_values(df)
df = handle_missing_data(df)
df = remove_duplicates(df)
# Define years for analysis
years = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']
country_column = 'country_name'
gender_column = 'sex'
age_group_column = 'age_group'
rate_column = '2024'
# Perform EDA
display_descriptive_stats(df)
df = remove_outliers(df, '2024') # Remove outliers for the year 2024
plot_distribution(df, '2024') # Plot distribution of data for 2024
plot_pairplot(df, ['2020', '2021', '2022', '2023'])
plot_trend(df, years) # Plot trend over years
plot_correlation_heatmap(df) # Plot correlation heatmap
# Additional visualizations
plot_box_plot(df, '2024', group_by='sex') # Box plot grouped by sex
plot_kde(df, '2024')
plot_scatter_geo(df, '2024')
plot_top_20_countries_by_avg_unemployment(df, rate_column='2024', country_column='country_name')
plot_top_10_countries_pie_chart(df, years, country_column='country_name')
plot_top_10_countries_stacked_bar_chart(df, years, country_column='country_name')
# Plot gender unemployment pie chart
plot_gender_unemployment_pie(df, 'sex', '2024')
# Enhanced visualizations
plot_all_feature_distributions(df)
plot_all_boxplots(df)
print("Plotting Unemployment Trends by Country...")
plot_unemployment_trends(df, country_column, years)
print("Plotting Average Unemployment by Gender...")
plot_unemployment_by_gender(df, gender_column, years)
print("Plotting Correlation Between Age Group and Unemployment...")
plot_correlation_with_age_group(df, age_group_column, years)
print("Plotting High Unemployment Map (Interactive)...")
for year in years: # Interactive map for each year
plot_high_unemployment_map(df, country_column, year)
print("Plotting Comparison of Age and Gender for Unemployment...")
plot_age_gender_unemployment(df, age_group_column, gender_column, rate_column)
compare_unemployment_by_gender_and_age(df)
scatterplot_unemployment_by_age(df)
countplot_and_scatterplot_unemployment(df)
if __name__ == "__main__":
main()
Dataset loaded successfully.
First 5 rows of the dataset:
country_name indicator_name sex age_group \
0 Afghanistan Unemployment rate by sex and age Female 15-24
1 Afghanistan Unemployment rate by sex and age Female 25+
2 Afghanistan Unemployment rate by sex and age Female Under 15
3 Afghanistan Unemployment rate by sex and age Male 15-24
4 Afghanistan Unemployment rate by sex and age Male 25+
age_categories 2014 2015 2016 2017 2018 2019 2020 \
0 Youth 13.340 15.974 18.570 21.137 20.649 20.154 21.228
1 Adults 8.576 9.014 9.463 9.920 11.223 12.587 14.079
2 Children 10.306 11.552 12.789 14.017 14.706 15.418 16.783
3 Youth 9.206 11.502 13.772 16.027 15.199 14.361 14.452
4 Adults 6.463 6.879 7.301 7.728 7.833 7.961 8.732
2021 2022 2023 2024
0 21.640 30.561 32.200 33.332
1 14.415 23.818 26.192 28.298
2 17.134 26.746 29.193 30.956
3 15.099 16.655 18.512 19.770
4 9.199 11.357 12.327 13.087
Shape of the dataset: (1134, 16)
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1134 entries, 0 to 1133
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 country_name 1134 non-null object
1 indicator_name 1134 non-null object
2 sex 1134 non-null object
3 age_group 1134 non-null object
4 age_categories 1134 non-null object
5 2014 1134 non-null float64
6 2015 1134 non-null float64
7 2016 1134 non-null float64
8 2017 1134 non-null float64
9 2018 1134 non-null float64
10 2019 1134 non-null float64
11 2020 1134 non-null float64
12 2021 1134 non-null float64
13 2022 1128 non-null float64
14 2023 1122 non-null float64
15 2024 1122 non-null float64
dtypes: float64(11), object(5)
memory usage: 141.9+ KB
None
Missing values in each column:
country_name 0
indicator_name 0
sex 0
age_group 0
age_categories 0
2014 0
2015 0
2016 0
2017 0
2018 0
2019 0
2020 0
2021 0
2022 6
2023 12
2024 12
dtype: int64
Missing values handled by filling with the median.
There are 0 duplicate rows in the dataset.
Duplicates removed.
Descriptive Statistics:
2014 2015 2016 2017 2018 \
count 1134.000000 1134.000000 1134.000000 1134.000000 1134.000000
mean 11.387800 11.272444 11.122963 10.863516 10.516499
std 11.119002 10.915942 10.742947 10.640980 10.527773
min 0.027000 0.034000 0.038000 0.035000 0.044000
25% 3.933500 3.993500 3.945250 3.747500 3.672750
50% 7.697500 7.547500 7.504500 7.140500 6.706000
75% 15.050750 14.766250 14.467500 14.142000 13.343000
max 74.485000 74.655000 74.720000 75.416000 76.395000
2019 2020 2021 2022 2023 \
count 1134.000000 1134.000000 1134.000000 1134.000000 1134.000000
mean 10.311452 11.851285 11.422645 10.320420 9.947941
std 10.297952 11.231580 10.873412 10.241248 9.941275
min 0.036000 0.056000 0.064000 0.067000 0.063000
25% 3.538500 4.334500 4.153500 3.568500 3.484500
50% 6.627500 8.067500 7.542500 6.571500 6.466000
75% 13.285500 15.316250 14.881500 13.362750 12.824500
max 77.173000 83.990000 82.135000 78.776000 78.541000
2024
count 1134.000000
mean 9.902247
std 9.931283
min 0.060000
25% 3.484500
50% 6.364000
75% 12.571000
max 78.644000
Outliers removed from column '2024' using Z-score threshold of 3.
Plotting Unemployment Trends by Country...
Plotting Average Unemployment by Gender...
Plotting Correlation Between Age Group and Unemployment...
Plotting High Unemployment Map (Interactive)...
Plotting Comparison of Age and Gender for Unemployment...
In [ ]:
In [354]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load your dataset (make sure to replace the file path with your actual file)
data = pd.read_csv('global_unemployment_data.csv')
# Filter data for a specific year (e.g., 2024) and specific variables
data_2024 = data[['country_name', 'age_categories', 'sex', '2024']]
# Create a violin plot
plt.figure(figsize=(12, 6))
sns.violinplot(x='age_categories', y='2024', hue='sex', data=data_2024, split=True)
# Add labels and title
plt.title('Unemployment Distribution by Age Category and Gender (2024)')
plt.xlabel('Age Category')
plt.ylabel('Unemployment Rate')
plt.legend(title='Gender')
# Display the plot
plt.show()
In [355]:
import seaborn as sns
import matplotlib.pyplot as plt
# Plot histogram for unemployment rates in 2024
plt.figure(figsize=(10, 6))
sns.histplot(data['2024'], kde=True, bins=20, color='skyblue')
plt.title('Distribution of Unemployment Rates in 2024')
plt.xlabel('Unemployment Rate')
plt.ylabel('Frequency')
plt.show()
In [356]:
# Box plot to show the distribution of unemployment rates for different age categories in 2024
plt.figure(figsize=(12, 6))
sns.boxplot(x='age_categories', y='2024', data=data, palette='Set2')
plt.title('Unemployment Rate Distribution by Age Categories (2024)')
plt.xlabel('Age Category')
plt.ylabel('Unemployment Rate')
plt.show()
In [357]:
# Scatter plot to compare unemployment rates for Youth (15-24) and Adults (25+)
youth_data = data[data['age_group'] == '15-24']
adult_data = data[data['age_group'] == '25+']
plt.figure(figsize=(10, 6))
sns.scatterplot(x='2024', y='2024', data=youth_data, label='Youth (15-24)', color='orange')
sns.scatterplot(x='2024', y='2024', data=adult_data, label='Adults (25+)', color='blue')
plt.title('Unemployment Rates (Youth vs Adults) in 2024')
plt.xlabel('Unemployment Rate')
plt.ylabel('Unemployment Rate')
plt.legend()
plt.show()
In [358]:
# Pair plot to visualize relationships between different age groups (2014-2024)
sns.pairplot(data[['age_categories', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']], hue='age_categories')
plt.title('Pair Plot of Unemployment Rates (2014-2024) by Age Categories')
plt.show()
In [359]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data = pd.read_csv('global_unemployment_data.csv')
# Reshape the data to long format using pd.melt
data_long = pd.melt(data, id_vars=['country_name', 'indicator_name', 'sex', 'age_group', 'age_categories'],
value_vars=[str(year) for year in range(2014, 2025)],
var_name='Year', value_name='Unemployment Rate')
# Convert 'Year' to numeric (if it's not already)
data_long['Year'] = pd.to_numeric(data_long['Year'])
# Line plot for unemployment rate trends by age group across years (2014-2024)
plt.figure(figsize=(12, 6))
sns.lineplot(x='Year', y='Unemployment Rate', hue='age_group', data=data_long, marker='o')
plt.title('Unemployment Rate Trends by Age Group (2014-2024)')
plt.xlabel('Year')
plt.ylabel('Unemployment Rate')
plt.legend(title='Age Group')
plt.show()
In [360]:
import plotly.express as px
# Treemap for unemployment rate by country, age group, and year
treemap_data = data_long.groupby(['country_name', 'age_group', 'Year'])['Unemployment Rate'].mean().reset_index()
fig = px.treemap(treemap_data,
path=['country_name', 'age_group', 'Year'],
values='Unemployment Rate',
color='Unemployment Rate',
color_continuous_scale='RdBu')
fig.update_layout(title='Treemap of Unemployment Rates by Country, Age Group, and Year')
fig.show()
In [361]:
# Filter data for the years 2014 and 2024
slope_data = data_long[data_long['Year'].isin([2014, 2024])]
# Slope graph for gender comparison between 2014 and 2024
plt.figure(figsize=(12, 6))
sns.lineplot(x='Year', y='Unemployment Rate', hue='sex', style='sex', markers=True, data=slope_data)
plt.title('Slope Graph: Unemployment Rate from 2014 to 2024 by Gender')
plt.xlabel('Year')
plt.ylabel('Unemployment Rate')
plt.legend(title='Gender')
plt.show()
In [362]:
# Facet grid for gender over time
g = sns.FacetGrid(data_long, col='sex', height=6, aspect=1.5)
g.map(sns.lineplot, 'Year', 'Unemployment Rate', marker='o')
g.set_axis_labels('Year', 'Unemployment Rate')
g.set_titles('Unemployment Rate by Gender')
plt.show()
In [363]:
# Create a correlation matrix for unemployment rates across years
corr_data = data[['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']].corr()
# Heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_data, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Unemployment Rates (2014-2024)')
plt.show()
In [364]:
# Create a pivot table for heatmap (average unemployment rate by country and year)
heatmap_data = data_long.pivot_table(index='country_name', columns='Year', values='Unemployment Rate', aggfunc='mean')
# Heatmap
plt.figure(figsize=(14, 10))
sns.heatmap(heatmap_data, annot=False, cmap='YlGnBu', fmt='.2f', linewidths=0.5)
plt.title('Heatmap of Unemployment Rates by Country (2014-2024)')
plt.xlabel('Year')
plt.ylabel('Country')
plt.show()
In [365]:
# Create a pivot table to reshape data for stacked area plot
data_pivot = data_long.pivot_table(index='Year', columns='sex', values='Unemployment Rate', aggfunc='mean')
# Stacked area plot
data_pivot.plot.area(figsize=(12, 6), cmap='Set2', alpha=0.6)
plt.title('Stacked Area Plot of Unemployment Rates by Gender (2014-2024)')
plt.xlabel('Year')
plt.ylabel('Unemployment Rate')
plt.show()
In [366]:
# Calculate the 3-year moving average for each group
data_long['Moving Avg'] = data_long.groupby('age_group')['Unemployment Rate'].transform(lambda x: x.rolling(3).mean())
# Line plot with moving average
plt.figure(figsize=(12, 6))
# Plot the original data
sns.lineplot(x='Year', y='Unemployment Rate', hue='age_group', data=data_long, marker='o', linestyle='--')
# Plot the moving average
sns.lineplot(x='Year', y='Moving Avg', hue='age_group', data=data_long, marker='o', linestyle='-', linewidth=2)
# Add title and labels
plt.title('Unemployment Rate Trends with 3-Year Moving Average')
plt.xlabel('Year')
plt.ylabel('Unemployment Rate')
# Display the legend
plt.legend(title='Age Group')
# Show the plot
plt.show()
In [367]:
# Facet grid of line plots by 'sex' and 'age_group'
g = sns.FacetGrid(data_long, col='sex', row='age_group', margin_titles=True)
g.map(sns.lineplot, 'Year', 'Unemployment Rate', marker='o')
g.set_axis_labels('Year', 'Unemployment Rate')
g.set_titles(col_template='{col_name}', row_template='{row_name}')
plt.show()
In [371]:
df=pd.read_csv('global_unemployment_data.csv')
df.describe()
Out[371]:
| 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 | 2024 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1134.000000 | 1134.000000 | 1134.000000 | 1134.000000 | 1134.000000 | 1134.000000 | 1134.000000 | 1134.000000 | 1128.000000 | 1122.000000 | 1122.000000 |
| mean | 11.387800 | 11.272444 | 11.122963 | 10.863516 | 10.516499 | 10.311452 | 11.851285 | 11.422645 | 10.340361 | 9.985181 | 9.940089 |
| std | 11.119002 | 10.915942 | 10.742947 | 10.640980 | 10.527773 | 10.297952 | 11.231580 | 10.873412 | 10.264810 | 9.987778 | 9.977512 |
| min | 0.027000 | 0.034000 | 0.038000 | 0.035000 | 0.044000 | 0.036000 | 0.056000 | 0.064000 | 0.067000 | 0.063000 | 0.060000 |
| 25% | 3.933500 | 3.993500 | 3.945250 | 3.747500 | 3.672750 | 3.538500 | 4.334500 | 4.153500 | 3.555250 | 3.477500 | 3.459750 |
| 50% | 7.697500 | 7.547500 | 7.504500 | 7.140500 | 6.706000 | 6.627500 | 8.067500 | 7.542500 | 6.571500 | 6.466000 | 6.364000 |
| 75% | 15.050750 | 14.766250 | 14.467500 | 14.142000 | 13.343000 | 13.285500 | 15.316250 | 14.881500 | 13.410000 | 12.914500 | 12.687750 |
| max | 74.485000 | 74.655000 | 74.720000 | 75.416000 | 76.395000 | 77.173000 | 83.990000 | 82.135000 | 78.776000 | 78.541000 | 78.644000 |
In [379]:
import pandas as pd
# Load the dataset (replace 'global_unemployment_data.csv' with your file path)
data = pd.read_csv('global_unemployment_data.csv')
# Select columns containing numeric data (assumed to represent years)
year_columns = data.select_dtypes(include=['float64', 'int64']).columns
# Dictionary to store results for each year
all_years_statistics = {}
# Function to calculate statistics for a given column
def calculate_statistics(series):
# Drop missing values for accurate calculations
series = series.dropna()
# Basic statistics
mean_value = series.mean()
median_value = series.median()
mode_value = series.mode().iloc[0] if not series.mode().empty else None
std_deviation = series.std()
data_range = series.max() - series.min()
variance_value = series.var()
q1 = series.quantile(0.25)
q3 = series.quantile(0.75)
quartile_deviation = (q3 - q1) / 2
mean_deviation = (series - mean_value).abs().mean()
coefficient_of_variation = (std_deviation / mean_value) * 100 if mean_value != 0 else None
coefficient_of_standard_deviation = std_deviation / mean_value if mean_value != 0 else None
coefficient_of_mean_deviation = mean_deviation / mean_value if mean_value != 0 else None
coefficient_of_quartile_deviation = quartile_deviation / mean_value if mean_value != 0 else None
coefficient_of_range = data_range / mean_value if mean_value != 0 else None
# Additional statistics
min_value = series.min()
max_value = series.max()
skewness = series.skew()
kurtosis = series.kurt()
iqr = q3 - q1
# Determine skewness type
if skewness > 0:
skewness_type = "Positively Skewed"
elif skewness < 0:
skewness_type = "Negatively Skewed"
else:
skewness_type = "Symmetric"
# Determine kurtosis type
if kurtosis > 0:
kurtosis_type = "Leptokurtic"
elif kurtosis < 0:
kurtosis_type = "Platykurtic"
else:
kurtosis_type = "Mesokurtic"
# Compile results
return {
"Mean": mean_value,
"Median": median_value,
"Mode": mode_value,
"Standard Deviation": std_deviation,
"Range": data_range,
"Variance": variance_value,
"Quartile Deviation": quartile_deviation,
"Mean Deviation": mean_deviation,
"Coefficient of Variation (%)": coefficient_of_variation,
"Coefficient of Standard Deviation": coefficient_of_standard_deviation,
"Coefficient of Mean Deviation": coefficient_of_mean_deviation,
"Coefficient of Quartile Deviation": coefficient_of_quartile_deviation,
"Coefficient of Range": coefficient_of_range,
"Minimum Value": min_value,
"Maximum Value": max_value,
"Skewness": skewness,
"Kurtosis": kurtosis,
"Interquartile Range (IQR)": iqr,
"Skewness Type": skewness_type,
"Kurtosis Type": kurtosis_type,
}
# Calculate statistics for each year column
for year in year_columns:
all_years_statistics[year] = calculate_statistics(data[year])
# Save results to a CSV file
results_df = pd.DataFrame(all_years_statistics).T
results_df.index.name = "Year"
results_df.to_csv('yearly_statistics.csv')
# Display results in the console
print("Yearly Statistics:")
print("=" * 50)
for year, stats in all_years_statistics.items():
print(f"Year: {year}")
for stat_name, value in stats.items():
print(f" {stat_name}: {value}")
print("-" * 50)
# Optionally, print the full statistics table
print("\nFull Statistics Table:")
print(results_df)
Yearly Statistics:
==================================================
Year: 2014
Mean: 11.387799823633156
Median: 7.6975
Mode: 2.264
Standard Deviation: 11.11900168293138
Range: 74.458
Variance: 123.63219842503086
Quartile Deviation: 5.558625
Mean Deviation: 8.043298890475256
Coefficient of Variation (%): 97.63959548934169
Coefficient of Standard Deviation: 0.9763959548934169
Coefficient of Mean Deviation: 0.7063084191015511
Coefficient of Quartile Deviation: 0.4881210669390376
Coefficient of Range: 6.538400845919065
Minimum Value: 0.027
Maximum Value: 74.485
Skewness: 2.026923616811268
Kurtosis: 5.07124149963963
Interquartile Range (IQR): 11.11725
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Year: 2015
Mean: 11.272444444444444
Median: 7.547499999999999
Mode: 0.269
Standard Deviation: 10.915942212254908
Range: 74.621
Variance: 119.1577943812886
Quartile Deviation: 5.386374999999999
Mean Deviation: 7.883938467568097
Coefficient of Variation (%): 96.83740084995286
Coefficient of Standard Deviation: 0.9683740084995286
Coefficient of Mean Deviation: 0.6993991859018341
Coefficient of Quartile Deviation: 0.47783557741592075
Coefficient of Range: 6.619770926152269
Minimum Value: 0.034
Maximum Value: 74.655
Skewness: 2.0397897396927407
Kurtosis: 5.227957583653392
Interquartile Range (IQR): 10.772749999999998
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Year: 2016
Mean: 11.122962962962964
Median: 7.5045
Mode: 1.019
Standard Deviation: 10.742947239204366
Range: 74.682
Variance: 115.41091538432872
Quartile Deviation: 5.261125000000001
Mean Deviation: 7.8123887909073115
Coefficient of Variation (%): 96.58350274990605
Coefficient of Standard Deviation: 0.9658350274990605
Coefficient of Mean Deviation: 0.7023658009939311
Coefficient of Quartile Deviation: 0.4729967201651572
Coefficient of Range: 6.714218167288226
Minimum Value: 0.038
Maximum Value: 74.72
Skewness: 1.992407314807538
Kurtosis: 4.9500609895600025
Interquartile Range (IQR): 10.522250000000001
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Year: 2017
Mean: 10.863515873015873
Median: 7.1405
Mode: 1.101
Standard Deviation: 10.640979566858013
Range: 75.381
Variance: 113.23044614228975
Quartile Deviation: 5.1972499999999995
Mean Deviation: 7.689108101676885
Coefficient of Variation (%): 97.95152592623698
Coefficient of Standard Deviation: 0.9795152592623698
Coefficient of Mean Deviation: 0.7077918596111256
Coefficient of Quartile Deviation: 0.4784132559615956
Coefficient of Range: 6.938913780872777
Minimum Value: 0.035
Maximum Value: 75.416
Skewness: 2.093915248078852
Kurtosis: 5.653766586471949
Interquartile Range (IQR): 10.394499999999999
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Year: 2018
Mean: 10.516499118165786
Median: 6.706
Mode: 4.523
Standard Deviation: 10.527772630230887
Range: 76.351
Variance: 110.83399655383857
Quartile Deviation: 4.835125
Mean Deviation: 7.558850691625532
Coefficient of Variation (%): 100.10719833604729
Coefficient of Standard Deviation: 1.0010719833604729
Coefficient of Mean Deviation: 0.7187611206631179
Coefficient of Quartile Deviation: 0.45976564498046646
Coefficient of Range: 7.260115666069357
Minimum Value: 0.044
Maximum Value: 76.395
Skewness: 2.1651873874465553
Kurtosis: 6.13819307785424
Interquartile Range (IQR): 9.67025
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Year: 2019
Mean: 10.311452380952382
Median: 6.6274999999999995
Mode: 0.994
Standard Deviation: 10.297952333538843
Range: 77.137
Variance: 106.0478222638381
Quartile Deviation: 4.8735
Mean Deviation: 7.404380532459898
Coefficient of Variation (%): 99.86907714922414
Coefficient of Standard Deviation: 0.9986907714922414
Coefficient of Mean Deviation: 0.718073483628503
Coefficient of Quartile Deviation: 0.4726298313710368
Coefficient of Range: 7.480711460442734
Minimum Value: 0.036
Maximum Value: 77.173
Skewness: 2.181476975996898
Kurtosis: 6.4373539591609745
Interquartile Range (IQR): 9.747
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Year: 2020
Mean: 11.8512848324515
Median: 8.067499999999999
Mode: 2.034
Standard Deviation: 11.231580456574875
Range: 83.934
Variance: 126.14839955251466
Quartile Deviation: 5.490875
Mean Deviation: 8.151791513862062
Coefficient of Variation (%): 94.77099416107413
Coefficient of Standard Deviation: 0.9477099416107413
Coefficient of Mean Deviation: 0.6878403168186973
Coefficient of Quartile Deviation: 0.46331474415033397
Coefficient of Range: 7.082270081820134
Minimum Value: 0.056
Maximum Value: 83.99
Skewness: 2.069508253743997
Kurtosis: 5.718613005852259
Interquartile Range (IQR): 10.98175
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Year: 2021
Mean: 11.422644620811287
Median: 7.5425
Mode: 1.881
Standard Deviation: 10.873412130631372
Range: 82.07100000000001
Variance: 118.2310913625615
Quartile Deviation: 5.364
Mean Deviation: 7.87412889865594
Coefficient of Variation (%): 95.19172215881382
Coefficient of Standard Deviation: 0.9519172215881382
Coefficient of Mean Deviation: 0.6893437693325247
Coefficient of Quartile Deviation: 0.46959352917512237
Coefficient of Range: 7.1849385781005735
Minimum Value: 0.064
Maximum Value: 82.135
Skewness: 2.1008939871144876
Kurtosis: 5.8841748201274715
Interquartile Range (IQR): 10.728
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Year: 2022
Mean: 10.340360815602837
Median: 6.5715
Mode: 3.671
Standard Deviation: 10.264809991760487
Range: 78.709
Variance: 105.36632416694593
Quartile Deviation: 4.927375
Mean Deviation: 7.3887414334037524
Coefficient of Variation (%): 99.2693598880191
Coefficient of Standard Deviation: 0.9926935988801909
Coefficient of Mean Deviation: 0.7145535407482774
Coefficient of Quartile Deviation: 0.4765186716274887
Coefficient of Range: 7.611823359319722
Minimum Value: 0.067
Maximum Value: 78.776
Skewness: 2.1918754214666936
Kurtosis: 6.521714541764725
Interquartile Range (IQR): 9.85475
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Year: 2023
Mean: 9.98518092691622
Median: 6.466
Mode: 2.696
Standard Deviation: 9.98777793910204
Range: 78.478
Variance: 99.7557081608134
Quartile Deviation: 4.718500000000001
Mean Deviation: 7.117426789124336
Coefficient of Variation (%): 100.02600866428789
Coefficient of Standard Deviation: 1.0002600866428788
Coefficient of Mean Deviation: 0.7127989809316806
Coefficient of Quartile Deviation: 0.4725502757071465
Coefficient of Range: 7.859446971907478
Minimum Value: 0.063
Maximum Value: 78.541
Skewness: 2.2849541832920033
Kurtosis: 7.2396529866501576
Interquartile Range (IQR): 9.437000000000001
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Year: 2024
Mean: 9.940089126559716
Median: 6.364
Mode: 0.908
Standard Deviation: 9.977511954850113
Range: 78.584
Variance: 99.55074480917693
Quartile Deviation: 4.614
Mean Deviation: 7.099028437886256
Coefficient of Variation (%): 100.37648383041562
Coefficient of Standard Deviation: 1.0037648383041562
Coefficient of Mean Deviation: 0.7141815679416592
Coefficient of Quartile Deviation: 0.4641809486065357
Coefficient of Range: 7.905764123384483
Minimum Value: 0.06
Maximum Value: 78.644
Skewness: 2.295452117498033
Kurtosis: 7.324325866032145
Interquartile Range (IQR): 9.228
Skewness Type: Positively Skewed
Kurtosis Type: Leptokurtic
--------------------------------------------------
Full Statistics Table:
Mean Median Mode Standard Deviation Range Variance \
Year
2014 11.3878 7.6975 2.264 11.119002 74.458 123.632198
2015 11.272444 7.5475 0.269 10.915942 74.621 119.157794
2016 11.122963 7.5045 1.019 10.742947 74.682 115.410915
2017 10.863516 7.1405 1.101 10.64098 75.381 113.230446
2018 10.516499 6.706 4.523 10.527773 76.351 110.833997
2019 10.311452 6.6275 0.994 10.297952 77.137 106.047822
2020 11.851285 8.0675 2.034 11.23158 83.934 126.1484
2021 11.422645 7.5425 1.881 10.873412 82.071 118.231091
2022 10.340361 6.5715 3.671 10.26481 78.709 105.366324
2023 9.985181 6.466 2.696 9.987778 78.478 99.755708
2024 9.940089 6.364 0.908 9.977512 78.584 99.550745
Quartile Deviation Mean Deviation Coefficient of Variation (%) \
Year
2014 5.558625 8.043299 97.639595
2015 5.386375 7.883938 96.837401
2016 5.261125 7.812389 96.583503
2017 5.19725 7.689108 97.951526
2018 4.835125 7.558851 100.107198
2019 4.8735 7.404381 99.869077
2020 5.490875 8.151792 94.770994
2021 5.364 7.874129 95.191722
2022 4.927375 7.388741 99.26936
2023 4.7185 7.117427 100.026009
2024 4.614 7.099028 100.376484
Coefficient of Standard Deviation Coefficient of Mean Deviation \
Year
2014 0.976396 0.706308
2015 0.968374 0.699399
2016 0.965835 0.702366
2017 0.979515 0.707792
2018 1.001072 0.718761
2019 0.998691 0.718073
2020 0.94771 0.68784
2021 0.951917 0.689344
2022 0.992694 0.714554
2023 1.00026 0.712799
2024 1.003765 0.714182
Coefficient of Quartile Deviation Coefficient of Range Minimum Value \
Year
2014 0.488121 6.538401 0.027
2015 0.477836 6.619771 0.034
2016 0.472997 6.714218 0.038
2017 0.478413 6.938914 0.035
2018 0.459766 7.260116 0.044
2019 0.47263 7.480711 0.036
2020 0.463315 7.08227 0.056
2021 0.469594 7.184939 0.064
2022 0.476519 7.611823 0.067
2023 0.47255 7.859447 0.063
2024 0.464181 7.905764 0.06
Maximum Value Skewness Kurtosis Interquartile Range (IQR) \
Year
2014 74.485 2.026924 5.071241 11.11725
2015 74.655 2.03979 5.227958 10.77275
2016 74.72 1.992407 4.950061 10.52225
2017 75.416 2.093915 5.653767 10.3945
2018 76.395 2.165187 6.138193 9.67025
2019 77.173 2.181477 6.437354 9.747
2020 83.99 2.069508 5.718613 10.98175
2021 82.135 2.100894 5.884175 10.728
2022 78.776 2.191875 6.521715 9.85475
2023 78.541 2.284954 7.239653 9.437
2024 78.644 2.295452 7.324326 9.228
Skewness Type Kurtosis Type
Year
2014 Positively Skewed Leptokurtic
2015 Positively Skewed Leptokurtic
2016 Positively Skewed Leptokurtic
2017 Positively Skewed Leptokurtic
2018 Positively Skewed Leptokurtic
2019 Positively Skewed Leptokurtic
2020 Positively Skewed Leptokurtic
2021 Positively Skewed Leptokurtic
2022 Positively Skewed Leptokurtic
2023 Positively Skewed Leptokurtic
2024 Positively Skewed Leptokurtic
In [382]:
import pandas as pd
import numpy as np
data = pd.read_csv('global_unemployment_data.csv')
# Select numeric columns for analysis
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
# Function to calculate moments
def calculate_moments(series):
# Drop missing values for accurate calculations
series = series.dropna()
mean_value = np.mean(series)
# Raw moments about the mean
moment_2 = np.mean((series - mean_value) ** 2) # 2nd raw moment (variance)
moment_3 = np.mean((series - mean_value) ** 3) # 3rd raw moment
moment_4 = np.mean((series - mean_value) ** 4) # 4th raw moment
# Moments about an arbitrary origin (e.g., 50th percentile)
origin = np.percentile(series, 50)
moment_about_origin = np.mean((series - origin) ** 2)
# Central moment
central_moment = np.mean(series ** 2)
return {
'Moment_2 (Variance)': moment_2,
'Moment_3': moment_3,
'Moment_4': moment_4,
'Moment_About_Origin': moment_about_origin,
'Central_Moment': central_moment
}
# Dictionary to store moments for each numeric column
moments_results = {}
# Calculate moments for each numeric column
for col in numeric_columns:
moments_results[col] = calculate_moments(data[col])
# Convert results to a DataFrame
moments_df = pd.DataFrame(moments_results).T
moments_df.index.name = "Column"
# Save moments to a CSV file
moments_df.to_csv('moments_statistics.csv')
# Display results
print("Moments Statistics:")
print("=" * 50)
print(moments_df)
Moments Statistics:
==================================================
Moment_2 (Variance) Moment_3 Moment_4 Moment_About_Origin \
Column
2014 123.523175 2778.977295 122729.393895 137.141488
2015 119.052717 2646.179583 116218.126336 132.927928
2016 115.309142 2463.761609 105345.458407 128.402416
2017 113.130596 2516.252126 110369.194737 126.991443
2018 110.736259 2519.736962 111660.976011 125.256163
2019 105.954306 2376.038607 105569.113315 119.525811
2020 126.037158 2924.421768 138014.330317 140.354185
2021 118.126831 2693.716667 123533.970968 133.182353
2022 105.272914 2364.355181 105144.448233 119.477226
2023 99.666799 2270.502852 101342.145315 112.051434
2024 99.462019 2273.908211 101760.041940 112.250432
Central_Moment
Column
2014 253.205160
2015 246.120721
2016 239.029447
2017 231.146573
2018 221.333013
2019 212.280356
2020 266.490110
2021 248.603641
2022 212.195976
2023 199.370637
2024 198.267390
In [380]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset
data = pd.read_csv('global_unemployment_data.csv')
# Select columns containing numeric data
year_columns = data.select_dtypes(include=['float64', 'int64']).columns
# Function to visualize skewness
def visualize_skewness(data, column_name):
plt.figure(figsize=(10, 6))
sns.histplot(data[column_name].dropna(), kde=True, bins=30, color='skyblue', label="Histogram")
plt.axvline(data[column_name].mean(), color='red', linestyle='--', linewidth=2, label="Mean")
plt.axvline(data[column_name].median(), color='green', linestyle='-', linewidth=2, label="Median")
plt.title(f"Distribution of {column_name}", fontsize=16)
plt.xlabel(column_name, fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.legend()
plt.grid(True)
plt.show()
# Visualize skewness for each year column
for year in year_columns:
visualize_skewness(data, year)
In [ ]:
In [381]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset (replace 'global_unemployment_data.csv' with your file path)
data = pd.read_csv('global_unemployment_data.csv')
# Select columns containing numeric data (assumed to represent years)
year_columns = data.select_dtypes(include=['float64', 'int64']).columns
# Function to visualize kurtosis
def visualize_kurtosis(data, column_name):
plt.figure(figsize=(10, 6))
sns.histplot(data[column_name].dropna(), kde=True, bins=30, color='skyblue', label="Histogram")
plt.title(f"Kurtosis Visualization for {column_name}", fontsize=16)
plt.xlabel(column_name, fontsize=14)
plt.ylabel("Frequency", fontsize=14)
# Calculate kurtosis
kurtosis_value = data[column_name].dropna().kurt()
if kurtosis_value > 0:
kurtosis_type = "Leptokurtic"
elif kurtosis_value < 0:
kurtosis_type = "Platykurtic"
else:
kurtosis_type = "Mesokurtic"
# Add a text box with kurtosis information
plt.text(
x=0.95, y=0.95,
s=f"Kurtosis: {kurtosis_value:.2f}\nType: {kurtosis_type}",
fontsize=12, color='black',
transform=plt.gca().transAxes,
verticalalignment='top', horizontalalignment='right',
bbox=dict(facecolor='white', alpha=0.7, edgecolor='black')
)
plt.legend()
plt.grid(True)
plt.show()
# Visualize kurtosis for each year column
for year in year_columns:
print(f"Visualizing kurtosis for {year}...")
visualize_kurtosis(data, year)
Visualizing kurtosis for 2014...
Visualizing kurtosis for 2015...
Visualizing kurtosis for 2016...
Visualizing kurtosis for 2017...
Visualizing kurtosis for 2018...
Visualizing kurtosis for 2019...
Visualizing kurtosis for 2020...
Visualizing kurtosis for 2021...
Visualizing kurtosis for 2022...
Visualizing kurtosis for 2023...
Visualizing kurtosis for 2024...
In [ ]:
In [ ]: